Loading needed libraries
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dbplyr)
##
## Attaching package: 'dbplyr'
## The following objects are masked from 'package:dplyr':
##
## ident, sql
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(ranger)
library(DHARMa)
## This is DHARMa 0.4.1. For overview type '?DHARMa'. For recent changes, type news(package = 'DHARMa') Note: Syntax of plotResiduals has changed in 0.3.0, see ?plotResiduals for details
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ranger':
##
## importance
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
Loading the data
# Column names for the Seoul bike-sharing dataset; the file's own header
# row is skipped below and replaced with these names
col_names <- c("Date","Rented Bike Count","Hour", "Temperature", "Humidity", "Wind speed", "Visibility", "Dew point temperature", "Solar Radiation", "Rainfall", "Snowfall", "Seasons", "Holiday", "Functioning Day")
# Read the CSV without its header (skip = 1) and apply col_names;
# read.csv converts spaces in names to dots (e.g. Rented.Bike.Count)
bike <- read.csv('SeoulBikeData.csv', header = FALSE, sep = ",", col.names = col_names, skip = 1)
# Show the raw data frame for a first look
bike
# Five-number summary of the response (hourly rented-bike counts)
summary(bike$Rented.Bike.Count)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 191.0 504.5 704.6 1065.2 3556.0
# Print the mean and the variance of the output
# (variance >> mean below — a first sign of overdispersion, which a plain
# Poisson model cannot accommodate)
sprintf("Variance of Y: %f", var(bike$Rented.Bike.Count))
## [1] "Variance of Y: 416021.733390"
sprintf("Mean of Y: %f", mean(bike$Rented.Bike.Count))
## [1] "Mean of Y: 704.602055"
# Map a lubridate::wday() day number to a weekday flag: 1 for a weekday,
# 0 for a weekend day. Vectorized, so it works on whole columns directly
# as well as through the sapply() calls in the (commented-out) pipeline.
# NOTE(review): with wday()'s default numbering (Sunday = 1), days 6 and 7
# are Friday/Saturday — confirm c(6, 7) is really the intended weekend pair
# (Saturday/Sunday would be c(1, 7)).
checkWeekday <- function(dateNumber) {
  as.numeric(!(dateNumber %in% c(6, 7)))
}
# Map a lubridate::wday() day number to a weekend flag: 1 for a weekend
# day, 0 for a weekday (the complement of checkWeekday). Vectorized, so
# it also works without sapply().
# NOTE(review): with wday()'s default numbering (Sunday = 1), days 6 and 7
# are Friday/Saturday — confirm c(6, 7) is the intended weekend pair.
checkWeekEnd <- function(dateNumber) {
  as.numeric(dateNumber %in% c(6, 7))
}
# Encode the Holiday column as a numeric flag: 0 for "No Holiday",
# 1 for anything else. Vectorized, so it works on whole columns directly
# as well as through the sapply() call in the commented-out pipeline.
checkHoliday <- function(holidayStatus) {
  as.numeric(holidayStatus != "No Holiday")
}
# Earlier feature-engineering draft kept for reference (weekday/weekend
# and holiday flags built with the check* helpers above):
# bike_with_time <- bike %>%
# mutate(Date = dmy(Date),
# Day = as.integer(wday(Date)),
# Weekday = sapply(X = Day, FUN = checkWeekday),
# WeekEnd = sapply(X = Day, FUN = checkWeekEnd),
# Holiday = sapply(X = Holiday, FUN = checkHoliday))
# To set the language in english for the dummy variables
# (session-wide side effect: month(label = TRUE) below then yields
# English month abbreviations regardless of the system locale)
Sys.setlocale("LC_TIME", "en_US.UTF-8")
## [1] "en_US.UTF-8"
# Derive day-of-week and month-name features from the parsed date
bike_with_time <- bike %>%
mutate(Date = dmy(Date),
Day = as.integer(wday(Date)),
Month = as.character(month(Date, label= TRUE)))
# Drop the raw Date column once its components have been extracted;
# the explicit dplyr:: prefix guards against select() being masked
# (MASS, attached later, also exports select)
bike_with_time <- bike_with_time %>%
dplyr::select(-Date)
# One-hot encode the categorical predictors with caret::dummyVars.
# NOTE(review): dummyVars defaults to fullRank = FALSE, so every factor
# level gets its own column — this is what produces the rank-deficiency
# ("coefficients not defined because of singularities") warnings in the
# regressions below; confirm whether fullRank = TRUE was intended.
bike_dummy <- dummyVars(Rented.Bike.Count~., data = bike_with_time)
bike_dummy <- predict(bike_dummy, newdata = bike_with_time)
bike_encoded <- as.data.frame.matrix(bike_dummy)
# Rename the awkward "HolidayNo Holiday" dummy to a valid identifier
colnames(bike_encoded)[colnames(bike_encoded) == "HolidayNo Holiday"] <- "NoHoliday"
# dummyVars drops the response, so re-attach it as the LAST column
# (later code indexes the response by its last-column position)
bike_encoded$Rented.Bike.Count <- bike$Rented.Bike.Count
# final dataframe encoded which will be split
bike_encoded
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(splitTools)
library(ModelMetrics)
##
## Attaching package: 'ModelMetrics'
## The following objects are masked from 'package:caret':
##
## confusionMatrix, precision, recall, sensitivity, specificity
## The following object is masked from 'package:base':
##
## kappa
# Split the encoded data into an 80% training and 20% testing set,
# stratified on the response so both sets share its distribution.
# Seed fixed for reproducibility of the split.
set.seed(23)
split_idx <- partition(bike_encoded$Rented.Bike.Count, p = c(train = 0.8, test = 0.2))
bike_train <- bike_encoded[split_idx$train, ]
bike_test <- bike_encoded[split_idx$test, ]
Poisson regression
Model
# Poisson regression model on all encoded predictors (log link is the
# glm default for family = "poisson")
poisson_lm1 <- glm(Rented.Bike.Count ~., family = "poisson", data = bike_train)
summary(poisson_lm1)
##
## Call:
## glm(formula = Rented.Bike.Count ~ ., family = "poisson", data = bike_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -49.107 -8.745 -1.413 4.940 104.801
##
## Coefficients: (7 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.741e+00 8.239e-03 696.769 < 2e-16 ***
## Hour 4.283e-02 7.847e-05 545.749 < 2e-16 ***
## Temperature 1.364e-02 3.095e-04 44.062 < 2e-16 ***
## Humidity -1.489e-02 8.863e-05 -167.990 < 2e-16 ***
## Wind.speed 2.146e-02 5.058e-04 42.428 < 2e-16 ***
## Visibility 6.042e-05 1.163e-06 51.953 < 2e-16 ***
## Dew.point.temperature 2.747e-02 3.193e-04 86.031 < 2e-16 ***
## Solar.Radiation -7.856e-02 6.684e-04 -117.534 < 2e-16 ***
## Rainfall -5.002e-01 2.385e-03 -209.757 < 2e-16 ***
## Snowfall -1.080e-01 2.163e-03 -49.948 < 2e-16 ***
## SeasonsAutumn 7.815e-01 4.915e-03 159.007 < 2e-16 ***
## SeasonsSpring 8.990e-01 4.373e-03 205.594 < 2e-16 ***
## SeasonsSummer 9.320e-01 4.774e-03 195.216 < 2e-16 ***
## SeasonsWinter NA NA NA NA
## HolidayHoliday -2.139e-01 2.471e-03 -86.537 < 2e-16 ***
## NoHoliday NA NA NA NA
## Functioning.DayNo -1.808e+01 1.069e+01 -1.692 0.09062 .
## Functioning.DayYes NA NA NA NA
## Day 1.603e-02 2.254e-04 71.144 < 2e-16 ***
## MonthApr -1.502e-01 2.218e-03 -67.711 < 2e-16 ***
## MonthAug -5.911e-01 2.156e-03 -274.155 < 2e-16 ***
## MonthDec 2.455e-01 3.904e-03 62.875 < 2e-16 ***
## MonthFeb 1.087e-02 4.089e-03 2.659 0.00783 **
## MonthJan NA NA NA NA
## MonthJul -4.083e-01 2.014e-03 -202.707 < 2e-16 ***
## MonthJun NA NA NA NA
## MonthMar -3.368e-01 2.644e-03 -127.416 < 2e-16 ***
## MonthMay NA NA NA NA
## MonthNov 1.138e-01 3.161e-03 35.987 < 2e-16 ***
## MonthOct 2.467e-01 2.358e-03 104.630 < 2e-16 ***
## MonthSep NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 3991664 on 7006 degrees of freedom
## Residual deviance: 1155312 on 6983 degrees of freedom
## AIC: 1209023
##
## Number of Fisher Scoring iterations: 9
# creation of simulated residuals with DHARMa on the poisson regression model
simulationOutput <- simulateResiduals(fittedModel = poisson_lm1, plot = F)
# QQ plot of the scaled (quantile) residuals against a uniform distribution
plotQQunif(simulationOutput)
## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details

# Check for dispersion in the data
# (dispersion ~189 with p < 2.2e-16: strong overdispersion, so the plain
# Poisson model is inadequate for these counts)
testDispersion(simulationOutput, plot = F)
##
## DHARMa nonparametric dispersion test via sd of residuals fitted vs.
## simulated
##
## data: simulationOutput
## dispersion = 189, p-value < 2.2e-16
## alternative hypothesis: two.sided
Quasipoisson Regression
Model
# Quasipoisson model: same mean structure and point estimates as the
# Poisson fit, but the dispersion parameter is estimated from the data
# instead of being fixed at 1, which inflates the standard errors
poisson_lm2 <- glm(Rented.Bike.Count ~., family = "quasipoisson", data = bike_train)
summary(poisson_lm2)
##
## Call:
## glm(formula = Rented.Bike.Count ~ ., family = "quasipoisson",
## data = bike_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -49.107 -8.745 -1.413 4.940 104.801
##
## Coefficients: (7 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.741e+00 3.276e+00 1.753 0.0797 .
## Hour 4.283e-02 3.120e-02 1.373 0.1699
## Temperature 1.364e-02 1.231e-01 0.111 0.9118
## Humidity -1.489e-02 3.524e-02 -0.423 0.6727
## Wind.speed 2.146e-02 2.011e-01 0.107 0.9150
## Visibility 6.042e-05 4.624e-04 0.131 0.8960
## Dew.point.temperature 2.747e-02 1.270e-01 0.216 0.8287
## Solar.Radiation -7.856e-02 2.657e-01 -0.296 0.7675
## Rainfall -5.002e-01 9.481e-01 -0.528 0.5978
## Snowfall -1.080e-01 8.600e-01 -0.126 0.9000
## SeasonsAutumn 7.815e-01 1.954e+00 0.400 0.6892
## SeasonsSpring 8.990e-01 1.738e+00 0.517 0.6051
## SeasonsSummer 9.320e-01 1.898e+00 0.491 0.6234
## SeasonsWinter NA NA NA NA
## HolidayHoliday -2.139e-01 9.826e-01 -0.218 0.8277
## NoHoliday NA NA NA NA
## Functioning.DayNo -1.808e+01 4.249e+03 -0.004 0.9966
## Functioning.DayYes NA NA NA NA
## Day 1.603e-02 8.960e-02 0.179 0.8580
## MonthApr -1.502e-01 8.819e-01 -0.170 0.8648
## MonthAug -5.911e-01 8.572e-01 -0.690 0.4905
## MonthDec 2.455e-01 1.552e+00 0.158 0.8743
## MonthFeb 1.087e-02 1.626e+00 0.007 0.9947
## MonthJan NA NA NA NA
## MonthJul -4.083e-01 8.008e-01 -0.510 0.6102
## MonthJun NA NA NA NA
## MonthMar -3.368e-01 1.051e+00 -0.320 0.7486
## MonthMay NA NA NA NA
## MonthNov 1.138e-01 1.257e+00 0.091 0.9279
## MonthOct 2.467e-01 9.376e-01 0.263 0.7924
## MonthSep NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasipoisson family taken to be 158070.4)
##
## Null deviance: 3991664 on 7006 degrees of freedom
## Residual deviance: 1155312 on 6983 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 9
Negative binomial Regression
Model
# Negative binomial (NB) model with log link; theta is estimated by glm.nb.
# NOTE(review): Rainfall is excluded here but kept in the other models —
# presumably for fitting/stability reasons; confirm this was intentional.
poisson_lm3 <- glm.nb(formula = Rented.Bike.Count ~ . -Rainfall, link = "log", data = bike_train)
summary(poisson_lm3)
##
## Call:
## glm.nb(formula = Rented.Bike.Count ~ . - Rainfall, data = bike_train,
## link = "log", init.theta = 2.477575252)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.6769 -0.6781 -0.1236 0.3208 4.8027
##
## Coefficients: (7 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.239e+00 1.537e-01 47.105 < 2e-16 ***
## Hour 3.831e-02 1.245e-03 30.764 < 2e-16 ***
## Temperature -3.068e-02 6.129e-03 -5.006 5.56e-07 ***
## Humidity -3.255e-02 1.696e-03 -19.200 < 2e-16 ***
## Wind.speed -9.285e-03 8.590e-03 -1.081 0.279745
## Visibility 7.070e-05 1.925e-05 3.673 0.000239 ***
## Dew.point.temperature 8.287e-02 6.407e-03 12.934 < 2e-16 ***
## Solar.Radiation -3.795e-02 1.281e-02 -2.963 0.003042 **
## Snowfall -7.244e-02 1.784e-02 -4.061 4.88e-05 ***
## SeasonsAutumn 5.581e-01 6.930e-02 8.054 7.99e-16 ***
## SeasonsSpring 6.861e-01 5.882e-02 11.665 < 2e-16 ***
## SeasonsSummer 7.311e-01 6.647e-02 11.000 < 2e-16 ***
## SeasonsWinter NA NA NA NA
## HolidayHoliday -3.305e-01 3.675e-02 -8.993 < 2e-16 ***
## NoHoliday NA NA NA NA
## Functioning.DayNo -3.009e+01 4.335e+03 -0.007 0.994462
## Functioning.DayYes NA NA NA NA
## Day 2.692e-02 3.887e-03 6.926 4.33e-12 ***
## MonthApr -1.213e-01 4.024e-02 -3.014 0.002576 **
## MonthAug -6.214e-01 4.145e-02 -14.991 < 2e-16 ***
## MonthDec 2.752e-01 3.757e-02 7.324 2.41e-13 ***
## MonthFeb 9.685e-03 3.849e-02 0.252 0.801330
## MonthJan NA NA NA NA
## MonthJul -4.466e-01 4.032e-02 -11.076 < 2e-16 ***
## MonthJun NA NA NA NA
## MonthMar -2.375e-01 4.327e-02 -5.488 4.07e-08 ***
## MonthMay NA NA NA NA
## MonthNov 2.864e-01 5.393e-02 5.310 1.10e-07 ***
## MonthOct 3.402e-01 4.452e-02 7.641 2.16e-14 ***
## MonthSep NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(2.4776) family taken to be 1)
##
## Null deviance: 23433.6 on 7006 degrees of freedom
## Residual deviance: 7246.1 on 6984 degrees of freedom
## AIC: 96567
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 2.4776
## Std. Err.: 0.0406
##
## 2 x log-likelihood: -96518.6990
# Residual analysis of the NB-regression
simulationOutput <- simulateResiduals(fittedModel = poisson_lm3, plot = F, refit = F)
# QQ plot residuals for NB-regression
plotQQunif(simulationOutput)
## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details

# Check the dispersion
# (dispersion ~0.53, p < 2.2e-16: the data are now significantly
# UNDER-dispersed relative to the fitted NB model)
testDispersion(simulationOutput)

##
## DHARMa nonparametric dispersion test via sd of residuals fitted vs.
## simulated
##
## data: simulationOutput
## dispersion = 0.52772, p-value < 2.2e-16
## alternative hypothesis: two.sided
# Zero-inflation check: observed zeros match the simulated expectation
# (ratio ~1, p = 1), so no zero inflation is detected for the NB fit
testZeroInflation(simulationOutput)

##
## DHARMa zero-inflation test via comparison to expected zeros with
## simulation under H0 = fitted model
##
## data: simulationOutput
## ratioObsSim = 0.99968, p-value = 1
## alternative hypothesis: two.sided
Zero-inflated NB
# Zero-inflated NB regression model: the count part uses all predictors,
# the zero-inflation (binomial) part is modelled on Hour only
zeroinfl_poisson <- zeroinfl(Rented.Bike.Count ~ . | Hour, data = bike_train,
dist = "negbin", link = "log")
## Warning in value[[3L]](cond): le système est numériquement singulier :
## conditionnement de la réciproque = 3.31183e-25FALSE
# NOTE(review): the warning above ("the system is numerically singular")
# is consistent with the rank-deficient dummy encoding; it explains why
# every standard error in the summary below is NA.
summary(zeroinfl_poisson)
##
## Call:
## zeroinfl(formula = Rented.Bike.Count ~ . | Hour, data = bike_train, dist = "negbin",
## link = "log")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -1.5135 -0.6140 -0.1200 0.3881 13.7655
##
## Count model coefficients (negbin with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.221e+00 NA NA NA
## Hour 3.937e-02 NA NA NA
## Temperature -2.342e-02 NA NA NA
## Humidity -2.944e-02 NA NA NA
## Wind.speed -7.120e-03 NA NA NA
## Visibility 6.929e-05 NA NA NA
## Dew.point.temperature 7.435e-02 NA NA NA
## Solar.Radiation -3.679e-02 NA NA NA
## Rainfall -1.108e-01 NA NA NA
## Snowfall -7.839e-02 NA NA NA
## SeasonsAutumn 4.203e-01 NA NA NA
## SeasonsSpring 5.120e-01 NA NA NA
## SeasonsSummer 6.550e-01 NA NA NA
## SeasonsWinter -8.709e-02 NA NA NA
## HolidayHoliday -3.858e-01 NA NA NA
## NoHoliday -5.117e-02 NA NA NA
## Functioning.DayNo -1.508e-12 NA NA NA
## Functioning.DayYes -4.832e-02 NA NA NA
## Day 2.687e-02 NA NA NA
## MonthApr -8.267e-02 NA NA NA
## MonthAug -6.563e-01 NA NA NA
## MonthDec 2.215e-01 NA NA NA
## MonthFeb -3.916e-02 NA NA NA
## MonthJan -4.780e-02 NA NA NA
## MonthJul -4.959e-01 NA NA NA
## MonthJun -3.744e-02 NA NA NA
## MonthMar -2.039e-01 NA NA NA
## MonthMay 5.920e-02 NA NA NA
## MonthNov 2.863e-01 NA NA NA
## MonthOct 3.441e-01 NA NA NA
## MonthSep 1.017e-02 NA NA NA
## Log(theta) 9.482e-01 NA NA NA
##
## Zero-inflation model coefficients (binomial with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.3909953 NA NA NA
## Hour 0.0003823 NA NA NA
##
## Theta = 2.581
## Number of iterations in BFGS optimization: 1
## Log-likelihood: -4.914e+04 on 34 Df
# Zero-inflated NB-regression predictions on the held-out set.
# The response (Rented.Bike.Count) is the LAST column of bike_test, so
# drop it by position instead of the original hard-coded index 31 — the
# magic number silently breaks if the encoding ever adds/removes a column.
resp_col <- ncol(bike_test)
y_pred_zeroinfl <- predict(zeroinfl_poisson, newdata = bike_test[, -resp_col])
# Test-set RMSE (rmse is symmetric in its arguments, so the value is the
# same either way round)
rmse(y_pred_zeroinfl, bike_test[, resp_col])
## [1] 447.1093
Comparisons of the 3 models (Poisson)
Predictions for poisson regressions
# Index of the last column (the response). ncol() replaces the original
# `length(bike_train)[1]`, which relied on length() of a data frame
# returning its column count, and `<-` replaces `=` for assignment.
nb_col <- ncol(bike_train)
# Poisson regression predictions.
# Fix: the original index `1: nb_col-1` parsed as (1:nb_col) - 1, i.e.
# 0:(nb_col - 1); it only worked because R silently drops index 0.
# seq_len(nb_col - 1) selects the predictor columns explicitly.
y_pred_poisson <- predict(poisson_lm1,
                          newdata = bike_test[, seq_len(nb_col - 1)],
                          type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
# Quasi-poisson regression predictions
y_pred_quasipoisson <- predict(poisson_lm2,
                               newdata = bike_test[, seq_len(nb_col - 1)],
                               type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
# Negative binomial regression predictions
y_pred_nb <- predict(poisson_lm3,
                     newdata = bike_test[, seq_len(nb_col - 1)],
                     type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
Try Linear Regression (Bad idea)
# Ordinary linear regression baseline (Visibility dropped). Counts are
# non-negative integers, so a Gaussian linear model is a poor fit by
# construction — see the negative predicted minimum further below.
model_lm <- lm(Rented.Bike.Count ~. -Visibility, data = bike_train)
summary(model_lm)
##
## Call:
## lm(formula = Rented.Bike.Count ~ . - Visibility, data = bike_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1465.52 -261.46 -46.93 205.98 1970.13
##
## Coefficients: (7 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 736.4365 96.0262 7.669 1.97e-14 ***
## Hour 26.7316 0.7946 33.643 < 2e-16 ***
## Temperature 12.3524 3.9453 3.131 0.00175 **
## Humidity -12.9014 1.0764 -11.986 < 2e-16 ***
## Wind.speed 25.5273 5.4512 4.683 2.88e-06 ***
## Dew.point.temperature 17.5542 4.1622 4.218 2.50e-05 ***
## Solar.Radiation -96.2550 8.1078 -11.872 < 2e-16 ***
## Rainfall -53.1698 4.3862 -12.122 < 2e-16 ***
## Snowfall 31.4723 11.4911 2.739 0.00618 **
## SeasonsAutumn 292.4838 38.8163 7.535 5.50e-14 ***
## SeasonsSpring 353.4829 35.7011 9.901 < 2e-16 ***
## SeasonsSummer 484.2193 40.2305 12.036 < 2e-16 ***
## SeasonsWinter NA NA NA NA
## HolidayHoliday -152.2853 23.1731 -6.572 5.33e-11 ***
## NoHoliday NA NA NA NA
## Functioning.DayNo -951.2204 28.4597 -33.423 < 2e-16 ***
## Functioning.DayYes NA NA NA NA
## Day 12.7636 2.4900 5.126 3.04e-07 ***
## MonthApr -149.0339 25.4825 -5.848 5.18e-09 ***
## MonthAug -541.3169 25.4016 -21.310 < 2e-16 ***
## MonthDec 69.4644 24.2270 2.867 0.00415 **
## MonthFeb -39.3966 24.8242 -1.587 0.11255
## MonthJan NA NA NA NA
## MonthJul -371.4566 25.1335 -14.779 < 2e-16 ***
## MonthJun NA NA NA NA
## MonthMar -247.5485 26.9442 -9.187 < 2e-16 ***
## MonthMay NA NA NA NA
## MonthNov 6.4561 29.3392 0.220 0.82584
## MonthOct 148.6621 26.0829 5.700 1.25e-08 ***
## MonthSep NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 413.4 on 6984 degrees of freedom
## Multiple R-squared: 0.5923, Adjusted R-squared: 0.591
## F-statistic: 461.1 on 22 and 6984 DF, p-value: < 2.2e-16
Predictions
# Linear regression predictions (the response is the last column, nb_col,
# and is dropped from the newdata frame)
y_pred_lm <- predict(model_lm, newdata = bike_test[, -nb_col], type = "response")
## Warning in predict.lm(model_lm, newdata = bike_test[, -nb_col], type =
## "response"): prediction from a rank-deficient fit may be misleading
Random Forest
Model
# Random forest regression on the training set with default hyperparameters
rf_model <- randomForest(Rented.Bike.Count ~ ., data = bike_train)
Predictions
# Random Forest model predictions on the test set (response column dropped)
y_pred_rf <- predict(rf_model, newdata = bike_test[, -nb_col])
Comparisons of the models
# Test-set RMSE for each model (observed counts first, predictions second).
# Poisson and quasipoisson share point estimates, hence identical RMSE.
sprintf("Poisson regression : %1.0f", rmse(bike_test[, nb_col], y_pred_poisson))
## [1] "Poisson regression : 371"
sprintf("Quasipoisson : %1.0f", rmse(bike_test[, nb_col], y_pred_quasipoisson))
## [1] "Quasipoisson : 371"
sprintf("Negative binomiale : %1.0f",rmse(bike_test[, nb_col], y_pred_nb))
## [1] "Negative binomiale : 428"
sprintf("Linear Regression : %1.0f",rmse(bike_test[, nb_col], y_pred_lm))
## [1] "Linear Regression : 416"
# The random forest is the clear winner on RMSE
sprintf("Random Forest : %1.0f",rmse(bike_test[, nb_col], y_pred_rf))
## [1] "Random Forest : 187"
# We can see that the min is negative for LR — the linear model predicts
# impossible negative counts, unlike the count models and the forest
print("Linear Regression")
## [1] "Linear Regression"
summary(y_pred_lm)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -597.9 395.1 713.2 718.7 1089.0 1842.7
print("Poisson regression")
## [1] "Poisson regression"
summary(y_pred_poisson)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 296.9 572.9 713.0 1075.0 2460.8
print("Quasipoisson regression")
## [1] "Quasipoisson regression"
summary(y_pred_quasipoisson)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 296.9 572.9 713.0 1075.0 2460.8
print("Negative binomial regression")
## [1] "Negative binomial regression"
summary(y_pred_nb)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 317.2 573.2 722.6 1079.0 5368.1
print("Random Forest")
## [1] "Random Forest"
summary(y_pred_rf)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 238.8 539.8 708.1 1079.4 2860.3
Results
# Test-set RMSLE for each model.
# NOTE(review): the linear model is omitted here — presumably because its
# negative predictions make the log term undefined; confirm.
sprintf("Poisson regression : %.3f", rmsle(bike_test[, nb_col], y_pred_poisson))
## [1] "Poisson regression : 0.733"
sprintf("Quasipoisson : %.3f", rmsle(bike_test[, nb_col], y_pred_quasipoisson))
## [1] "Quasipoisson : 0.733"
sprintf("Negative binomiale : %.3f",rmsle(bike_test[, nb_col], y_pred_nb))
## [1] "Negative binomiale : 0.811"
sprintf("Random Forest : %.3f",rmsle(bike_test[, nb_col], y_pred_rf))
## [1] "Random Forest : 0.693"
# Number of predictor features. Fix: nb_col counts every column of
# bike_train INCLUDING the response, so subtract 1 for the response.
# (With 31 columns, floor(31 / 3) == floor(30 / 3) == 10, so the mtry
# actually used below is unchanged by this fix.)
n_features <- nb_col - 1
# Tuning grid over the number of trees; rmse is filled in by the loop
tuning_grid <- expand.grid(
  trees = seq(10, 1000, by = 20),
  rmse = NA
)
# Fit one random forest per candidate number of trees
for (i in seq_len(nrow(tuning_grid))) {
  fit <- ranger(
    formula = Rented.Bike.Count ~ .,
    data = bike_train,
    num.trees = tuning_grid$trees[i],
    # common regression-forest heuristic: mtry = p / 3
    mtry = floor(n_features / 3),
    respect.unordered.factors = 'order',
    verbose = FALSE,
    seed = 123
  )
  # Extract OOB RMSE (ranger's prediction.error is the OOB MSE)
  tuning_grid$rmse[i] <- sqrt(fit$prediction.error)
}
# Out-of-bag RMSE as a function of the number of trees
ggplot(tuning_grid, aes(trees, rmse)) +
geom_line(size = 1) +
ylab("OOB Error (RMSE)") +
xlab("Number of trees")

# Two-dimensional tuning grid: number of trees crossed with candidate
# mtry values, with a slot for the OOB RMSE of each combination
tuning_grid <- expand.grid(
  trees = seq(10, 1000, by = 10),
  mtry = c(5, 10, 15, 20, 21, 30),
  rmse = NA
)
# Fit one forest per grid row (fixed seed so rows are comparable) and
# record its out-of-bag RMSE
for (grid_row in seq_len(nrow(tuning_grid))) {
  rf_fit <- ranger(
    formula = Rented.Bike.Count ~ .,
    data = bike_train,
    num.trees = tuning_grid$trees[grid_row],
    mtry = tuning_grid$mtry[grid_row],
    respect.unordered.factors = 'order',
    verbose = FALSE,
    seed = 23
  )
  # ranger reports the OOB error as an MSE; store its square root
  tuning_grid$rmse[grid_row] <- sqrt(rf_fit$prediction.error)
}
# Anchor points for the curve labels: one row per mtry value, taken at
# the right-hand end of each curve (trees == 990)
labels <- tuning_grid %>%
  filter(trees == 990) %>%
  mutate(mtry = as.factor(mtry))
# Grid-search result: one OOB-RMSE curve per mtry value, with each curve
# labelled at its end via ggrepel
tuning_grid %>%
  mutate(mtry = as.factor(mtry)) %>%
  ggplot(aes(trees, rmse, color = mtry)) +
  geom_line(size = 1, show.legend = TRUE) +
  ggrepel::geom_text_repel(data = labels, aes(trees, rmse, label = mtry), nudge_x = 50, show.legend = FALSE) +
  ylab("OOB Error (RMSE)") +
  xlab("Number of trees") +
  labs(title = "Grid search on ntree and mtry")
